{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:03:13.354231Z",
     "start_time": "2024-09-01T08:03:13.352265Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from graphs import get_nx_graph, get_neighbors, get_2hop_neighbors, verbalize_neighbors_triples_from_graph, verbalize_neighbors_triples_from_triples"
   ],
   "id": "df428f3867617ce1",
   "outputs": [],
   "execution_count": 19
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:02:46.555070Z",
     "start_time": "2024-09-01T08:02:46.545287Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Load sample data.\n",
    "\n",
    "# Candidate graph: one JSON object per line, read via its 's', 'p' and 'o' keys.\n",
    "STEP01_OUTPUT_FILE = 'output/test/step-01.jsonl'\n",
    "candidate_triples = []\n",
    "with open(STEP01_OUTPUT_FILE, 'r') as f:  # context manager closes the handle\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # tolerate blank lines instead of failing in json.loads\n",
    "        t = json.loads(line)\n",
    "        candidate_triples.append((t['s'], t['p'], t['o']))\n",
    "\n",
    "# Concept vocabulary: header-less, '|'-separated file; the first column is the index.\n",
    "id_2_concept = {i: c['concept'] for i, c in\n",
    "                pd.read_csv('data/refined_concepts.tsv', sep='|', header=None,\n",
    "                            names=['id', 'concept'], index_col=0).iterrows()}\n",
    "concept_2_id = {c: i for i, c in id_2_concept.items()}\n",
    "\n",
    "# Relation vocabulary: relation names are the top-level JSON keys; ids follow key order.\n",
    "with open('data/relation_types.json') as f:\n",
    "    relation_def = json.load(f)\n",
    "relation_types = list(relation_def.keys())\n",
    "relation_2_id = {name: idx for idx, name in enumerate(relation_types)}\n",
    "\n",
    "# Gold standard annotated graph: tab-separated (subject, predicate, object) per line.\n",
    "prerequisite_of_triples = []\n",
    "with open('data/prerequisite-of_graph.tsv', 'r') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # skip blank lines rather than failing on the 3-way unpack\n",
    "        s, p, o = line.strip().split('\\t')\n",
    "        prerequisite_of_triples.append((s, p, o))"
   ],
   "id": "693876fda38c4f11",
   "outputs": [],
   "execution_count": 12
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:02:46.890355Z",
     "start_time": "2024-09-01T08:02:46.887496Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Build the nx graph from the gold standard prerequisite triples,\n",
    "# passing the concept and relation id lookups alongside them.\n",
    "graph = get_nx_graph(\n",
    "    prerequisite_of_triples,\n",
    "    concept_2_id,\n",
    "    relation_2_id,\n",
    ")"
   ],
   "id": "5d65090b8ae4e6e2",
   "outputs": [],
   "execution_count": 13
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:02:47.312611Z",
     "start_time": "2024-09-01T08:02:47.310194Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Neighbour look-ups: default mode first, then explicitly outgoing and ingoing.\n",
    "# Each entry is (concept name, extra keyword arguments for get_neighbors).\n",
    "neighbor_queries = [\n",
    "    ('named entity recognition', {}),\n",
    "    ('natural language processing intro', {'mode': 'outgoing'}),\n",
    "    ('named entity recognition', {'mode': 'ingoing'}),\n",
    "]\n",
    "for query_concept, query_options in neighbor_queries:\n",
    "    print(get_neighbors(graph, query_concept, concept_2_id, id_2_concept, **query_options))"
   ],
   "id": "df022cd5fcf66eb6",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['linguistics basics', 'natural language processing intro']\n",
      "['spelling correction', 'word sense disambiguation', 'semantic role labeling', 'chomsky hierarchy', 'named entity recognition', 'shallow parsing', 'grammar checker', 'language identification', 'information extraction', 'dialog systems', 'event detection', 'cky parsing', 'propositional logic', 'automated essay scoring', 'kernels', 'nlp for the humanities', 'semantic parsing', 'shift-reduce parsing', 'knowledge representation', 'entailment', 'machine translation', 'word embedding', 'chinese nlp', 'speech processing', 'discourse analysis', 'parsing', 'regular expressions', 'Sequence to sequence', 'sentence boundary recognition', 'document representation', 'penn treebank', 'lexicography', 'text generation', 'bio text mining', 'recommendation system', 'morphology and lexicon', 'edit distance', 'context free grammars', 'probabilistic context free grammars', 'graph-based nlp', 'sentence simplification', 'relation extraction', 'course introduction', 'vector representations', 'feature selection', 'nlp and vision', 'tokenization', 'language modeling', 'text mining', 'semantic similarity', 'noisy channel model', 'query expansion', 'word segmentation', 'text to speech generation']\n",
      "['linguistics basics', 'natural language processing intro']\n"
     ]
    }
   ],
   "execution_count": 14
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:02:51.438410Z",
     "start_time": "2024-09-01T08:02:51.435798Z"
    }
   },
   "cell_type": "code",
   "source": "get_2hop_neighbors(graph, 'named entity recognition', concept_2_id, id_2_concept)",
   "id": "8d3a4e86daf07eff",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['spelling correction',\n",
       " 'word sense disambiguation',\n",
       " 'semantic role labeling',\n",
       " 'chomsky hierarchy',\n",
       " 'shallow parsing',\n",
       " 'grammar checker',\n",
       " 'language identification',\n",
       " 'information extraction',\n",
       " 'dialog systems',\n",
       " 'event detection',\n",
       " 'cky parsing',\n",
       " 'propositional logic',\n",
       " 'automated essay scoring',\n",
       " 'kernels',\n",
       " 'nlp for the humanities',\n",
       " 'semantic parsing',\n",
       " 'shift-reduce parsing',\n",
       " 'knowledge representation',\n",
       " 'entailment',\n",
       " 'computational phonology',\n",
       " 'chinese nlp',\n",
       " 'speech processing',\n",
       " 'discourse analysis',\n",
       " 'prosody',\n",
       " 'Sequence to sequence',\n",
       " 'sentence simplification',\n",
       " 'tokenization',\n",
       " 'machine translation',\n",
       " 'word embedding',\n",
       " 'parsing',\n",
       " 'regular expressions',\n",
       " 'sentence boundary recognition',\n",
       " 'document representation',\n",
       " 'penn treebank',\n",
       " 'lexicography',\n",
       " 'text generation',\n",
       " 'bio text mining',\n",
       " 'recommendation system',\n",
       " 'morphology and lexicon',\n",
       " 'edit distance',\n",
       " 'context free grammars',\n",
       " 'probabilistic context free grammars',\n",
       " 'graph-based nlp',\n",
       " 'speech synthesis',\n",
       " 'relation extraction',\n",
       " 'course introduction',\n",
       " 'phonetics',\n",
       " 'vector representations',\n",
       " 'feature selection',\n",
       " 'nlp and vision',\n",
       " 'language modeling',\n",
       " 'text mining',\n",
       " 'semantic similarity',\n",
       " 'noisy channel model',\n",
       " 'query expansion',\n",
       " 'word segmentation',\n",
       " 'text to speech generation']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 16
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:03:34.021973Z",
     "start_time": "2024-09-01T08:03:34.019754Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Verbalize the outgoing edges of one concept as '(s,p,o)' lines.\n",
    "nlp_intro_outgoing = verbalize_neighbors_triples_from_graph(\n",
    "    graph, 'natural language processing intro', concept_2_id, id_2_concept, mode='outgoing'\n",
    ")\n",
    "print(nlp_intro_outgoing)\n",
    "\n",
    "# Same helper called without an explicit mode.\n",
    "# NOTE(review): the saved output of this cell shows `None` for this call - confirm\n",
    "# whether verbalize_neighbors_triples_from_graph is meant to work without `mode`.\n",
    "ner_default_mode = verbalize_neighbors_triples_from_graph(\n",
    "    graph, 'named entity recognition', concept_2_id, id_2_concept\n",
    ")\n",
    "print(ner_default_mode)"
   ],
   "id": "f45077f3fa321907",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(natural language processing intro,Is-a-Prerequisite-of,spelling correction)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,word sense disambiguation)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,semantic role labeling)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,chomsky hierarchy)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,named entity recognition)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,shallow parsing)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,grammar checker)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,language identification)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,information extraction)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,dialog systems)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,event detection)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,cky parsing)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,propositional logic)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,automated essay scoring)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,kernels)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,nlp for the humanities)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,semantic parsing)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,shift-reduce parsing)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,knowledge representation)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,entailment)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,machine translation)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,word embedding)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,chinese nlp)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,speech processing)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,discourse analysis)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,parsing)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,regular expressions)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,Sequence to sequence)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,sentence boundary recognition)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,document representation)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,penn treebank)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,lexicography)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,text generation)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,bio text mining)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,recommendation system)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,morphology and lexicon)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,edit distance)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,context free grammars)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,probabilistic context free grammars)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,graph-based nlp)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,sentence simplification)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,relation extraction)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,course introduction)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,vector representations)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,feature selection)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,nlp and vision)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,tokenization)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,language modeling)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,text mining)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,semantic similarity)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,noisy channel model)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,query expansion)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,word segmentation)\n",
      "(natural language processing intro,Is-a-Prerequisite-of,text to speech generation)\n",
      "\n",
      "None\n"
     ]
    }
   ],
   "execution_count": 21
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-01T08:03:25.223337Z",
     "start_time": "2024-09-01T08:03:25.220494Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Verbalize the candidate triples for one concept straight from the triple list\n",
    "# (no graph needed); the call's return value is displayed as the cell output.\n",
    "concept_name = 'OCR post-correction'\n",
    "verbalize_neighbors_triples_from_triples(\n",
    "    candidate_triples,\n",
    "    concept_name,\n",
    ")"
   ],
   "id": "ee9288d3ccef55a3",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'(OCR post-correction,Compare,spelling correction)\\n(OCR post-correction,Part-of,sequence-to-sequence model)\\n'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 20
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
